# 给一个文本中的句子用结巴进行分词
import jieba


def handle(in_file, out_file):
    """Segment sentences from a tab-separated corpus file with jieba.

    Each input line is expected to look like ``label\\tsentence``; only the
    sentence (second field) is kept.  It is cut with jieba's accurate mode
    and written to *out_file* as space-joined words, one sentence per line.

    Args:
        in_file: path to the UTF-8 source corpus (tab-separated).
        out_file: path the segmented sentences are written to (UTF-8).
    """
    count = 0
    # Context managers guarantee both files are closed/flushed even on error;
    # iterating the file object streams line-by-line instead of loading the
    # whole corpus into memory with readlines().
    with open(in_file, 'r', encoding='utf-8') as fp, \
         open(out_file, 'w', encoding='utf-8') as fout:
        for line in fp:
            parts = line.rstrip('\n').split('\t')
            if len(parts) < 2:
                # Skip blank or malformed lines instead of crashing with
                # IndexError on lines that have no tab.
                continue
            sentence = parts[1]
            # Accurate (non-full) mode segmentation.
            word_list = jieba.cut(sentence, cut_all=False)
            # Strip the newline before cutting and add it back explicitly so
            # output lines don't end with a stray space before '\n'.
            fout.write(' '.join(word_list) + '\n')
            count += 1
            if count % 10000 == 0:
                print("执行到了第%d条" % count)
    print("总数：", count)


if __name__ == "__main__":
    # Source corpus file.
    source_path = 'D:/Projectspace/corpus/cnews/cnews.val.txt'
    # Destination for the segmented sentences.
    target_path = 'D:/Projectspace/corpus/cnews/cnews.val_cut.txt'
    handle(source_path, target_path)
